##### R CODE TO REPLICATE THE BALANCE CHECKS INCLUDED IN 
##### Adrian Lucardi, "The Effect of District Magnitude on Electoral Outcomes. Evidence from Two Natural Experiments in Argentina," forthcoming in British Journal of Political Science.

## Loading the required R packages. Make sure to have them installed before proceeding. In order to install a package called X, write install.packages("X")
library (ggplot2)
library (plyr)
library (xtable)

## display options
options (digits=4, scipen=8, show.signif.stars=FALSE)
(digits <- 3) ## number of decimals to use

## working directory -> change this to the directory where your have your data
setwd ("/Users/adrianlucardi/Dropbox/Replication files/")


###### DOWNLOADING AND PROCESSING THE DATA

## Main dataset
baseFull <- read.csv ("Argentine legislative elections 1983-2015.csv", sep=",", header=TRUE)

## Additional data
demo01 <- read.csv ("Balance and Placebos/Argentina Variables 01 Dependent B.csv", sep=",", header=TRUE)
demo02 <- read.csv ("Balance and Placebos/Argentina Variables 04 Control.csv", sep=",", header=TRUE)
demo04 <- read.csv ("Balance and Placebos/Culture 1914 Census.csv", sep=",", header=TRUE)
repr01 <- read.csv ("Balance and Placebos/Argentina Variables 05 Instrumental.csv", sep=",", header=TRUE)
ele83 <- read.csv ("Balance and Placebos/Argentine 1983 elections.csv", sep=",", header=TRUE)
eleDN <- read.csv ("Balance and Placebos/Argentine elections 1983-2015.csv", sep=",", header=TRUE)

# updating some variables
demo02$area <- with (demo02, area / 1000)
demo02$regCuyo <- with (demo02, ifelse (region=="ARG_CUYO", 1, 0))
demo02$regNEA <- with (demo02, ifelse (region=="ARG_NEA", 1, 0))
demo02$regNOA <- with (demo02, ifelse (region=="ARG_NOA", 1, 0))
demo02$regPampa <- with (demo02, ifelse (region=="ARG_PAMPA", 1, 0))
demo02$regPatag <- with (demo02, ifelse (region=="ARG_PATAGONIA", 1, 0))
demo02$founding <- with (demo02, ifelse (district=="BUE" | district=="CAT" | district=="CBA" | district=="CTES" | district=="ER" | district=="JUJ" | district=="LR" | district=="MZA" | district=="SFE" | district=="SGO" | district=="SJ" | district=="SL" | district=="STA" | district=="TUC", 1, 0)) ##to identify the country's 14 initial provinces
demo04$total <- with (demo04, natives + foreigners)
demo04$foreignSh <- with (demo04, foreigners / total)*100
repr01$seat.sh.dif.upper <- repr01$seat.sh.dif.upper*100
repr01$seat.sh.dif.lower <- repr01$seat.sh.dif.lower*100
repr01$seat.sh.dif.avg <- repr01$seat.sh.dif.avg*100

# adding data for national deputy elections
ele83 <- rbind.fill (ele83, eleDN[eleDN$date=="1983-10-30",c ("district", "date", "elType", "list", "votes", "voteSh", "seats", "partyPJ", "partyUCR", "party3rd")])
ele83 <- ele83[ele83$elType!="senloc",]

ele83$date <- NULL
ele83$district <- factor (ele83$district)
ele83$elType <- factor (ele83$elType)
ele83$list <- factor (ele83$list)
ele83$id <- with (ele83, factor (paste (district, elType, sep="_")))
ele83$seats <- with (ele83, ifelse (is.na (seats), 0, seats))
ele83$partyPJ <- with (ele83, ifelse (is.na (partyPJ), 0, partyPJ))
ele83$partyUCR <- with (ele83, ifelse (is.na (partyUCR), 0, partyUCR))
ele83$party3rd <- with (ele83, ifelse (is.na (party3rd), 0, party3rd))

# calculating the vote totals by district-elType
ele83b <- ele83[ele83$list=="VOTOS POSITIVOS" | ele83$list=="VOTOS VALIDOS",c ("id", "district", "elType", "votes")]
colnames (ele83b)[grep ("votes", colnames (ele83b))] <- "total"
ele83a <- ele83[ele83$list!="VOTOS POSITIVOS" & ele83$list!="TOTAL VOTANTES" & ele83$list!="VOTOS EN BLANCO" & ele83$list!="VOTOS BLANCOS" & ele83$list!="VOTOS ANULADOS" & ele83$list!="VOTOS NULOS",] ## we only want to keep valid votes
ele83a <- merge (ele83a, ele83b[,c ("id", "total")], by="id", all.x=TRUE, all.y=TRUE)
ele83a$voteSh <- with (ele83a, votes / total)*100

ele83c <- as.data.frame (with (ele83a, names (by (voteSh, id, max))))
colnames (ele83c)[1] <- "id"
ele83c$district <- factor (with (ele83c, unlist (strsplit (as.character (id), split="_")))[seq (1, 2*nrow (ele83c), by=2)])
ele83c$elType <- factor (with (ele83c, unlist (strsplit (as.character (id), split="_")))[seq (2, 2*nrow (ele83c), by=2)])
ele83c$id <- NULL
ele83c$votePJ <- with (ele83a, by (voteSh*partyPJ, id, sum)[])
ele83c$voteUCR <- with (ele83a, by (voteSh*partyUCR, id, sum)[])
ele83c$vote3rd <- with (ele83a, by (voteSh*party3rd, id, sum)[])
ele83c$vote12 <- with (ele83c, votePJ + voteUCR)

ele83d <- as.data.frame (names (with (ele83c, by (votePJ, district, colMeans))))
colnames (ele83d) <- "district"
ele83d$votePJ <- with (ele83c, by (votePJ, district, colMeans))[] ## for some reason, mean () returns NA's
ele83d$voteUCR <- with (ele83c, by (voteUCR, district, colMeans))[]
ele83d$vote3rd <- with (ele83c, by (vote3rd, district, colMeans))[]
ele83d$vote12 <- with (ele83c, by (vote12, district, colMeans))[]

# checking
summary (round (with (ele83d, vote12-votePJ-voteUCR), 4)) ## all 0's. Good
summary (round (with (ele83d, votePJ+voteUCR+vote3rd), 4))  ## all lower than 100. Good


## merging everything
base83 <- as.data.frame (factor (sort (unique (demo02$district))))
colnames (base83) <- "district"
base83$id80 <- factor (paste ("ARG", base83$district, "1980", sep="_"))
base83$id83 <- factor (paste ("ARG", base83$district, "1983", sep="_"))

base83 <- merge (base83, demo01[,c ("id", "nbi", "imr")], by.x="id80", by.y="id", all.x=TRUE, all.y=FALSE)
base83 <- merge (base83, demo02[,c ("id", "population", "area", "density", "geo_latitude", "geo_elevationmt", "geo_oceanaccess", "geo_trop_pct", "geo_pre_avg", "geo_tmp_avg", "geo_wnd_avg", "geo_oilandgas", "regCuyo", "regNEA", "regNOA", "regPampa", "regPatag", "founding")], by.x="id83", by.y="id", all.x=TRUE, all.y=FALSE)
# base83 <- merge (base83, demo03[,c ("id", "revenues.head", "rev.own", "rev.royalties", "rev.grant.auto", "rev.grant.discret")], by.x="id83", by.y="id", all.x=TRUE, all.y=FALSE)
base83 <- merge (base83, demo04[,c ("province", "foreignSh")], by.x="district", by.y="province", all.x=TRUE, all.y=FALSE)
base83 <- merge (base83, repr01[,c ("id", "ratio.seat.pop.lower", "seat.sh.dif.lower")], by.x="id83", by.y="id", all.x=TRUE, all.y=FALSE)
base83 <- merge (base83, ele83d, by.x="district", by.y="district", all.x=TRUE, all.y=FALSE)
base83 <- merge (base83, baseFull[baseFull$year==1983,c ("district", "revenues.head", "rev.own", "rev.royalties", "rev.grant.auto", "rev.grant.discret", "unemployment", "delegation", "nListSeats", "enps", "dispGall", "nLists", "enpv", "vote02sum", "largeMidterm", "odd")], by.x="district", by.y="district", all.x=T, all.y=F)

base83$population.log <- with (base83, log (population))
base83$density.log <- with (base83, log (density))
base83$revenues.head.log <- with (base83, log (revenues.head))
base83$geo_trop_pct <- with (base83, geo_trop_pct*100)
base83$nbi <- with (base83, nbi*100)

# reordering and eliminating some variables
base83 <- base83[base83$odd==1,]
base83$tdf <- with (base83, ifelse (district=="TDF", 1, 0))
base83 <- base83[order (base83$tdf, decreasing=F),] ## to send Tierra del Fuego to the end
base83$id83 <- NULL
base83$id80 <- NULL
base83$odd <- NULL
base83$tdf <- NULL
base83$district <- factor (base83$district)

# Identifying the variables for which we will perform the balance checks
vNames <- c (
  
  ## Dependent variables
  "nLists"
  , "enpv"
  , "vote02sum"
  , "nListSeats"
  , "enps"
  , "dispGall"
  
  ## Pseudo-dependent variables
  , "revenues.head.log"
  , "rev.own"
  , "rev.royalties"
  , "rev.grant.auto"
  , "rev.grant.discret"
  , "imr"
  
  ## Electoral outcomes, 1983 election
  , "votePJ"
  , "voteUCR"
  , "vote12"
  , "vote3rd"
  
  ## Demographic
  , "population.log"
  , "density.log"
  , "nbi"
  
  ## Geographic & historic
  , "area"
  , "geo_latitude"
  , "geo_elevationmt"
  , "geo_oceanaccess"
  , "geo_trop_pct"
  , "geo_pre_avg"
  , "geo_tmp_avg"
  , "geo_wnd_avg"
  , "geo_oilandgas"
  , "regCuyo"
  , "regNEA"
  , "regNOA"
  , "regPampa"
  , "regPatag"
  , "founding"
  , "foreignSh"
  
  ## Political representation
  , "delegation"
  , "ratio.seat.pop.lower"
  , "seat.sh.dif.lower"
)
vNames2b <- c (
  "\\# lists running"
  , "ENPV"
  , "vote first two"
  , "\\# lists seats"
  , "ENPS"
  , "Gallagher index"
  
  , "revenues per capita (log)"
  , "\\% own revenues"
  , "\\% royalties"
  , "\\% automatic transfers"
  , "\\% discretionary transfers"
  , "infant mortality rate (per 1,000)"
  
  , "\\% vote PJ (1983)"
  , "\\% vote UCR (1983)"
  , "\\% vote PJ+UCR (1983)"
  , "vote third party"
  
  , "population (1980) (log)"
  , "population density (1980) (log)"
  , "\\% poor (1980)"
  
  , "area (1,000s km2)"
  , "latitude"
  , "elevation"
  , "ocean access"
  , "\\% tropical"
  , "average precipitation"
  , "average temperature"
  , "average wind speed"
  , "\\# oil and gas fields"
  , "region: Cuyo"
  , "region: Northeast"
  , "region: Northwest"
  , "region: Pampa"
  , "region: Patagonia"
  , "founding province"
  , "\\% foreign population (1914)"
  
  , "delegation size (1983)"
  , "seat/population ratio (1983)"
  , "\\% seats - \\% population (1983)"
)
vNames2 <- paste ("\\emph{", vNames2b, "}", sep="") ## to display variable names in Italics

# creating the dataset for reporting the balance
balance <- as.data.frame (cbind (
  colMeans (base83[base83$largeMidterm==1,vNames], na.rm=TRUE)
  , colMeans (base83[base83$largeMidterm==0,vNames], na.rm=TRUE)
  , rep (NA, length (vNames))
  , rep (NA, length (vNames))
  , 1:length (vNames) ))
colnames (balance) <- c ("meanLM", "meanLC", "dif", "pval", "idn.y")
balance$dif <- with (balance, meanLM - meanLC)
round (balance, 3); dim (balance) ## checking balance for 71 variables


## Simulating alternative asignation of provinces to every group

set.seed (4927) ## for reproducibility
sims <- 50000 ## we will multiply this by 2
scenarios <- as.data.frame (matrix (NA, nrow=19, ncol=sims*2))
row.names (scenarios) <- as.character (base83$district)
colnames (scenarios) <- c (paste ("sce", 1:(sims*2), sep=""))

system.time (
  for (s in 1:sims){
    
    ## 8 districts with larger magnitude in midterm years
    scenarios[,s] <- c (sample (x=c (rep (1, 8), rep (0, 10)), size=18, replace=FALSE), 0) ##TDF always goes with the 0s
    
    ## 10 districts with larger magnitude in concurrent years
    scenarios[,sims+s] <- c (sample (x=c (rep (1, 10), rep (0, 8)), size=18, replace=FALSE), 1) ##now TDF always goes with the 1s
  }
) ## 983 seconds (16 minutes)


## estimating the differences between simulated values
simsLM <- as.data.frame (matrix (NA, nrow=(sims*2), ncol=length (vNames)))
colnames (simsLM) <- vNames
simsLC <- simsLM

system.time (
  for (v in 1:length (simsLM)){
    simsLM[,v] <- (t (scenarios) %*% base83[,vNames][,v]) / colSums (scenarios, na.rm=TRUE)
    simsLC[,v] <- ((1 - t (scenarios)) %*% base83[,vNames][,v]) / colSums (1 - scenarios, na.rm=TRUE)
  }
) ## 2129 seconds (35 minutes)
simsDif <- simsLM - simsLC

# computing the p-values
for (v in 1:length (vNames)){
  balance$pval[v] <- sum (abs (balance$dif[v]) <= abs (simsDif[,v])) / (sims*2)
}
round (balance, 3)

balance[balance$pval <= .15,] ## only 3 variables



### Replicating Table A1: Balance checks
(tab.balance <- round (balance[,1:4], 3))

Header1 <- paste ("\\toprule \\toprule & \\multicolumn{1}{c}{large midterm} & \\multicolumn{1}{c}{large concurrent} & \\multicolumn{1}{c}{} & \\multicolumn{1}{c}{} \\\\ \n")
Header2 <- paste ("\\multicolumn{1}{l}{(a) \\emph{Outcome variables (1983)}} & \\multicolumn{1}{c}{mean} & \\multicolumn{1}{c}{mean} & \\multicolumn{1}{c}{difference} & \\multicolumn{1}{c}{$p$-value} \\\\ \\midrule \n")
Header3 <- paste ("[1.25ex] \\multicolumn{5}{l}{(b) \\emph{Pseudo-outcomes (1983)}} \\\\ \\midrule \n")
Header4 <- paste ("[1.25ex] \\multicolumn{5}{l}{(c) \\emph{Electoral outcomes (1983)}} \\\\ \\midrule \n")
Header5 <- paste ("[1.25ex] \\multicolumn{5}{l}{(d) \\emph{Demographics (1980)}} \\\\ \\midrule \n")
Header6 <- paste ("[1.25ex] \\multicolumn{5}{l}{(e) \\emph{Geography and history}} \\\\ \\midrule \n")
Header7 <- paste ("[1.25ex] \\multicolumn{5}{l}{(f) \\emph{Political representation (1983)}} \\\\ \\midrule \n")

addtorow <- list ()
addtorow$pos <- list ()
addtorow$pos[[1]] <- 0
addtorow$pos[[2]] <- 0
addtorow$pos[[3]] <- 6
addtorow$pos[[4]] <- 12
addtorow$pos[[5]] <- 16
addtorow$pos[[6]] <- 19
addtorow$pos[[7]] <- 35
addtorow$command <- c (Header1, Header2
                       , Header3, Header4, Header5, Header6, Header7)
print (xtable ( cbind (vNames2, tab.balance)
                , align=c ("l","l","r","r","r","r")
                , digits=digits-1
                , caption="\\emph{Checking covariate balance}"
                , label="T:balance")
       , sanitize.text.function=function(x){x}
       , floating=TRUE
       , table.placement="t"
       , caption.placement="top" 
       , latex.environments="center"
       , size="footnotesize"
       , include.colnames=FALSE
       , include.rownames=FALSE
       , hline.after = c ()
       , add.to.row=addtorow )



### Replicating Figure 2: Balance checks

## Setting values that we will keep constant when presenting the results
cex.points <- 3.5
cex.text <- 21
col.line <- "gray66"
cex.bars <- 0.045*3
pd <- position_dodge (width = 0.5)
lim.x <- c (0,1)

## building the plot
vNames2b <- sub ("\\\\", replacement="", vNames2b)
vNames2b <- sub ("\\\\", replacement="", vNames2b)
balance$idn.y2 <- factor (nrow (balance):1)
balance$vNames2b <- as.character (vNames2b)
balance <- balance[order (balance$idn.y, decreasing=F),]

(pBalance <- ggplot (balance, aes (x=pval, y=idn.y2))
 + geom_vline (xintercept=c (0.05, 0.10), colour=col.line)
 + geom_point (shape=16, size=cex.points)
 + theme_bw (base_size=cex.text)
 + scale_colour_grey () ## grayscale for the printed version; comment out for colored version
 + scale_x_continuous ("p-value", limits=lim.x)
 + scale_y_discrete ("", labels=as.character (balance$vNames2))
)


## Exporting the plots
(aspect.ratio <- 1.5/1)
plotWidth <- 1000

png ("fig_balance.png", w=plotWidth, h=plotWidth/aspect.ratio)
pBalance
dev.off ()
